Variational Inference: Bayesian Neural Networks

Working through https://docs.pymc.io/notebooks/bayesian_neural_network_advi.html to learn how to build and fit a Bayesian neural network with ADVI in PyMC3.


In [24]:
%matplotlib inline
import theano
floatX = theano.config.floatX
import pymc3 as pm
import theano.tensor as T
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
sns.set(font_scale=1.5)
sns.set_style('white')
from sklearn import datasets
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
import matplotlib
import tqdm
import pandas as pd

In [2]:
X, Y = make_moons(noise=0.2, random_state=0, n_samples=1000)
X = scale(X)
X = X.astype(floatX)
Y = Y.astype(floatX)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5)

In [3]:
fig, ax = plt.subplots(figsize=(10,7))
ax.scatter(X[Y==0, 0], X[Y==0, 1], label='Class 0')
ax.scatter(X[Y==1, 0], X[Y==1, 1], color='r', label='Class 1')
sns.despine(); ax.legend()
ax.set(xlabel='X', ylabel='Y', title='Toy binary classification data set');


A neural network is conceptually simple. The basic unit is a perceptron, which is essentially logistic regression. Many of these units run in parallel and are stacked to form hidden layers. Here we use 2 hidden layers with 5 neurons each, which is sufficient for such a simple problem.
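To make the architecture concrete before introducing priors, here is a minimal, non-Bayesian sketch of the same forward pass in plain NumPy (the weight values are arbitrary placeholders, not part of the model):

import numpy as np

# Placeholder weights for a 2 -> 5 -> 5 -> 1 network (illustration only)
rng = np.random.RandomState(0)
w_in_1 = rng.randn(2, 5)    # input (2 features) -> 1st hidden layer
w_1_2 = rng.randn(5, 5)     # 1st hidden layer -> 2nd hidden layer
w_2_out = rng.randn(5)      # 2nd hidden layer -> output

def forward(X):
    a1 = np.tanh(X @ w_in_1)
    a2 = np.tanh(a1 @ w_1_2)
    return 1.0 / (1.0 + np.exp(-(a2 @ w_2_out)))   # sigmoid output = P(class = 1)

# e.g. forward(X_train[:3]) returns three probabilities in (0, 1)

The Bayesian version below keeps exactly this structure but places Normal priors on every weight.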


In [4]:
def construct_nn(ann_input, ann_output):
    n_hidden = 5

    # Initialize random weights between each layer
    init_1 = np.random.randn(X.shape[1], n_hidden).astype(floatX)
    init_2 = np.random.randn(n_hidden, n_hidden).astype(floatX)
    init_out = np.random.randn(n_hidden).astype(floatX)

    with pm.Model() as neural_network:
        # Trick: Turn inputs and outputs into shared variables using the data container pm.Data
        # It's still the same thing, but we can later change the values of the shared variable
        # (to switch in the test-data later) and pymc3 will just use the new data.
        # Kind-of like a pointer we can redirect.
        # For more info, see: http://deeplearning.net/software/theano/library/compile/shared.html
        ann_input = pm.Data('ann_input', X_train)
        ann_output = pm.Data('ann_output', Y_train)

        # Weights from input to hidden layer
        weights_in_1 = pm.Normal('w_in_1', 0, sigma=1,
                                 shape=(X.shape[1], n_hidden),
                                 testval=init_1)

        # Weights from 1st to 2nd layer
        weights_1_2 = pm.Normal('w_1_2', 0, sigma=1,
                                shape=(n_hidden, n_hidden),
                                testval=init_2)

        # Weights from hidden layer to output
        weights_2_out = pm.Normal('w_2_out', 0, sigma=1,
                                  shape=(n_hidden,),
                                  testval=init_out)

        # Build neural-network using tanh activation function
        act_1 = pm.math.tanh(pm.math.dot(ann_input,
                                         weights_in_1))
        act_2 = pm.math.tanh(pm.math.dot(act_1,
                                         weights_1_2))
        act_out = pm.math.sigmoid(pm.math.dot(act_2,
                                              weights_2_out))

        # Binary classification -> Bernoulli likelihood
        out = pm.Bernoulli('out',
                           act_out,
                           observed=ann_output,
                           total_size=Y_train.shape[0] # IMPORTANT for minibatches
                          )
    return neural_network

neural_network = construct_nn(X_train, Y_train)
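The total_size argument in the Bernoulli likelihood only matters when training on minibatches (here the full training set is used, so it has no effect). A hedged sketch of how minibatching could be wired in with pm.Minibatch, assuming an illustrative batch size of 50:

# Sketch only: pm.Minibatch returns a view that yields a fresh random batch
# each time the graph is evaluated; total_size=Y_train.shape[0] rescales the
# minibatch likelihood to the full data set.
minibatch_x = pm.Minibatch(X_train, batch_size=50)
minibatch_y = pm.Minibatch(Y_train, batch_size=50)
# Inside construct_nn, act_* would then be computed from minibatch_x and the
# Bernoulli likelihood observed on minibatch_y instead of the pm.Data containers.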



In [5]:
from pymc3.theanof import set_tt_rng, MRG_RandomStreams
set_tt_rng(MRG_RandomStreams(42))
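set_tt_rng seeds the Theano random streams that ADVI samples from. The weight starting values in construct_nn come from np.random.randn, so for fully reproducible runs NumPy would need to be seeded as well (a small addition, not in the original notebook):

np.random.seed(42)   # makes init_1 / init_2 / init_out reproducible when the model is rebuilt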

In [6]:
%%time

with neural_network:
    inference = pm.ADVI()
    approx = pm.fit(n=30000, method=inference)


Average Loss = 198.63: 100%|███████████████████████████████████████████████████| 30000/30000 [00:19<00:00, 1533.22it/s]
Finished [100%]: Average Loss = 198.54
Wall time: 30.8 s
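pm.fit also forwards extra keyword arguments to the underlying step function. A sketch (illustrative settings, not used in this run) of the same fit with an explicit Adam step size and a convergence callback:

from pymc3.variational.callbacks import CheckParametersConvergence

with neural_network:
    inference = pm.ADVI()
    approx = pm.fit(n=30000, method=inference,
                    obj_optimizer=pm.adam(learning_rate=0.01),
                    callbacks=[CheckParametersConvergence(diff='absolute')])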

In [7]:
# inference.hist records the tracked loss (the negative ELBO), so -inference.hist is the ELBO.
# The 'new ADVI' / 'old ADVI' labels appear to be carried over from the upstream tutorial's
# comparison of the older ADVI interface with the current OPVI-based one.
plt.plot(-inference.hist, label='new ADVI', alpha=.3)
plt.plot(approx.hist, label='old ADVI', alpha=.3)
plt.legend()
plt.ylabel('ELBO')
plt.xlabel('iteration');



In [8]:
trace = approx.sample(draws=5000)
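The draws can be inspected like any other PyMC3 trace; for example, the posterior over the hidden-to-output weights (a quick sketch):

trace['w_2_out'].shape          # (5000, 5): 5000 draws of the 5 output weights
trace['w_2_out'].mean(axis=0)   # posterior means
trace['w_2_out'].std(axis=0)    # posterior standard deviations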

In [9]:
neural_network.out.distribution.p


Out[9]:
sigmoid.0

In [10]:
# create symbolic input
x = T.matrix('X')
# symbolic number of samples is supported, we build vectorized posterior on the fly
n = T.iscalar('n')
# Do not forget test_values or set theano.config.compute_test_value = 'off'
x.tag.test_value = np.empty_like(X_train[:10])
n.tag.test_value = 100
_sample_proba = approx.sample_node(neural_network.out.distribution.p,
                                   size=n,
                                   more_replacements={neural_network['ann_input']: x})
# It is time to compile the function
# No updates are needed for Approximation random generator
# Efficient vectorized form of sampling is used
sample_proba = theano.function([x, n], _sample_proba)

# Create benchmark functions
def production_step1():
    pm.set_data(new_data={'ann_input': X_test, 'ann_output': Y_test}, model=neural_network)
    ppc = pm.sample_posterior_predictive(trace, samples=500, progressbar=False, model=neural_network)

    # A mean predicted probability > 0.5 is classified as class 1
    pred = ppc['out'].mean(axis=0) > 0.5

def production_step2():
    sample_proba(X_test, 500).mean(0) > 0.5
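Since sample_proba returns one predicted probability per posterior draw, it also gives predictive uncertainty essentially for free; a small sketch (not part of the benchmarks):

proba_samples = sample_proba(X_test, 500)   # shape (500, n_test)
pred_mean = proba_samples.mean(axis=0)      # mean P(class = 1) per test point
pred_std = proba_samples.std(axis=0)        # spread across posterior draws = uncertainty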

In [11]:
%timeit production_step1()


4.71 s ± 130 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [12]:
%timeit production_step2()


61 ms ± 741 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [13]:
pred = sample_proba(X_test, 500).mean(0) > 0.5

In [14]:
fig, ax = plt.subplots()
ax.scatter(X_test[pred==0, 0], X_test[pred==0, 1])
ax.scatter(X_test[pred==1, 0], X_test[pred==1, 1], color='r')
sns.despine()
ax.set(title='Predicted labels in testing set', xlabel='X', ylabel='Y');



In [15]:
print('Accuracy = {}%'.format((Y_test == pred).mean() * 100))


Accuracy = 87.2%

In [16]:
grid = pm.floatX(np.mgrid[-3:3:100j,-3:3:100j])
grid_2d = grid.reshape(2, -1).T
dummy_out = np.ones(grid.shape[1], dtype=np.int8)

In [17]:
ppc = sample_proba(grid_2d, 500)

In [18]:
# cmap = sns.diverging_palette(250, 12, s=85, l=25, as_cmap=True)
plt.set_cmap('bwr')
fig, ax = plt.subplots(figsize=(16, 9))
contour = ax.contourf(grid[0], grid[1], ppc.mean(axis=0).reshape(100, 100),)
ax.scatter(X_test[pred==0, 0], X_test[pred==0, 1])
ax.scatter(X_test[pred==1, 0], X_test[pred==1, 1], color='r')
cbar = plt.colorbar(contour, ax=ax)
_ = ax.set(xlim=(-3, 3), ylim=(-3, 3), xlabel='X', ylabel='Y');
cbar.ax.set_ylabel('Posterior predictive mean probability of class label = 1');


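The upstream tutorial also visualises where the network is uncertain by plotting the spread of the sampled probabilities; a sketch that mirrors the contour plot above, using the standard deviation across draws:

cmap = sns.cubehelix_palette(light=1, as_cmap=True)
fig, ax = plt.subplots(figsize=(16, 9))
contour = ax.contourf(grid[0], grid[1], ppc.std(axis=0).reshape(100, 100), cmap=cmap)
ax.scatter(X_test[pred==0, 0], X_test[pred==0, 1])
ax.scatter(X_test[pred==1, 0], X_test[pred==1, 1], color='r')
cbar = plt.colorbar(contour, ax=ax)
_ = ax.set(xlim=(-3, 3), ylim=(-3, 3), xlabel='X', ylabel='Y')
cbar.ax.set_ylabel('Uncertainty (posterior predictive standard deviation)');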

Now rewrite construct_nn so the number of hidden units can be passed as a parameter, and re-run the full pipeline.


In [19]:
def construct_nn(ann_input, ann_output, n_hidden=5):

    # Initialize random weights between each layer
    init_1 = np.random.randn(X.shape[1], n_hidden).astype(floatX)
    init_2 = np.random.randn(n_hidden, n_hidden).astype(floatX)
    init_out = np.random.randn(n_hidden).astype(floatX)

    with pm.Model() as neural_network:
        # Trick: Turn inputs and outputs into shared variables using the data container pm.Data
        # It's still the same thing, but we can later change the values of the shared variable
        # (to switch in the test-data later) and pymc3 will just use the new data.
        # Kind-of like a pointer we can redirect.
        # For more info, see: http://deeplearning.net/software/theano/library/compile/shared.html
        ann_input = pm.Data('ann_input', X_train)
        ann_output = pm.Data('ann_output', Y_train)

        # Weights from input to hidden layer
        weights_in_1 = pm.Normal('w_in_1', 0, sigma=1,
                                 shape=(X.shape[1], n_hidden),
                                 testval=init_1)

        # Weights from 1st to 2nd layer
        weights_1_2 = pm.Normal('w_1_2', 0, sigma=1,
                                shape=(n_hidden, n_hidden),
                                testval=init_2)

        # Weights from hidden layer to output
        weights_2_out = pm.Normal('w_2_out', 0, sigma=1,
                                  shape=(n_hidden,),
                                  testval=init_out)

        # Build neural-network using tanh activation function
        act_1 = pm.math.tanh(pm.math.dot(ann_input,
                                         weights_in_1))
        act_2 = pm.math.tanh(pm.math.dot(act_1,
                                         weights_1_2))
        act_out = pm.math.sigmoid(pm.math.dot(act_2,
                                              weights_2_out))

        # Binary classification -> Bernoulli likelihood
        out = pm.Bernoulli('out',
                           act_out,
                           observed=ann_output,
                           total_size=Y_train.shape[0] # IMPORTANT for minibatches
                          )
    return neural_network

neural_network = construct_nn(X_train, Y_train)

In [20]:
from pymc3.theanof import set_tt_rng, MRG_RandomStreams
set_tt_rng(MRG_RandomStreams(42))

with neural_network:
    inference = pm.ADVI()
    approx = pm.fit(n=30000, method=inference)

plt.plot(-inference.hist, label='new ADVI', alpha=.3)
plt.plot(approx.hist, label='old ADVI', alpha=.3)
plt.legend()
plt.ylabel('ELBO')
plt.xlabel('iteration');


trace = approx.sample(draws=5000)



# create symbolic input
x = T.matrix('X')
# symbolic number of samples is supported, we build vectorized posterior on the fly
n = T.iscalar('n')
# Do not forget test_values or set theano.config.compute_test_value = 'off'
x.tag.test_value = np.empty_like(X_train[:10])
n.tag.test_value = 100
_sample_proba = approx.sample_node(neural_network.out.distribution.p,
                                   size=n,
                                   more_replacements={neural_network['ann_input']: x})
# It is time to compile the function
# No updates are needed for Approximation random generator
# Efficient vectorized form of sampling is used
sample_proba = theano.function([x, n], _sample_proba)

# Create benchmark functions
def production_step1():
    pm.set_data(new_data={'ann_input': X_test, 'ann_output': Y_test}, model=neural_network)
    ppc = pm.sample_posterior_predictive(trace, samples=500, progressbar=False, model=neural_network)

    # A mean predicted probability > 0.5 is classified as class 1
    pred = ppc['out'].mean(axis=0) > 0.5

def production_step2():
    sample_proba(X_test, 500).mean(0) > 0.5

production_step1()
production_step2()

pred = sample_proba(X_test, 500).mean(0) > 0.5


fig, ax = plt.subplots()
ax.scatter(X_test[pred==0, 0], X_test[pred==0, 1])
ax.scatter(X_test[pred==1, 0], X_test[pred==1, 1], color='r')
sns.despine()
ax.set(title='Predicted labels in testing set', xlabel='X', ylabel='Y');

print('Accuracy = {}%'.format((Y_test == pred).mean() * 100))

grid = pm.floatX(np.mgrid[-3:3:100j,-3:3:100j])
grid_2d = grid.reshape(2, -1).T
dummy_out = np.ones(grid.shape[1], dtype=np.int8)

ppc = sample_proba(grid_2d, 500)



# cmap = sns.diverging_palette(250, 12, s=85, l=25, as_cmap=True)
plt.set_cmap('bwr')
fig, ax = plt.subplots(figsize=(16, 9))
contour = ax.contourf(grid[0], grid[1], ppc.mean(axis=0).reshape(100, 100),)
ax.scatter(X_test[pred==0, 0], X_test[pred==0, 1])
ax.scatter(X_test[pred==1, 0], X_test[pred==1, 1], color='r')
cbar = plt.colorbar(contour, ax=ax)
_ = ax.set(xlim=(-3, 3), ylim=(-3, 3), xlabel='X', ylabel='Y');
cbar.ax.set_ylabel('Posterior predictive mean probability of class label = 1');


Average Loss = 146.28: 100%|███████████████████████████████████████████████████| 30000/30000 [00:26<00:00, 1122.26it/s]
Finished [100%]: Average Loss = 146.38
Accuracy = 94.6%

Accuracy as a function of n_hidden (the loop below runs n_hidden = 1, ..., 19, so the DataFrame index further down is n_hidden - 1)


In [21]:
def run_this(n_hidden=5):
    neural_network = construct_nn(X_train, Y_train, n_hidden=n_hidden)
    
    from pymc3.theanof import set_tt_rng, MRG_RandomStreams
    set_tt_rng(MRG_RandomStreams(42))

    with neural_network:
        inference = pm.ADVI()
        approx = pm.fit(n=30000, method=inference)

    plt.plot(-inference.hist, label='new ADVI', alpha=.3)
    plt.plot(approx.hist, label='old ADVI', alpha=.3)
    plt.legend()
    plt.ylabel('ELBO')
    plt.xlabel('iteration');


    trace = approx.sample(draws=5000)



    # create symbolic input
    x = T.matrix('X')
    # symbolic number of samples is supported, we build vectorized posterior on the fly
    n = T.iscalar('n')
    # Do not forget test_values or set theano.config.compute_test_value = 'off'
    x.tag.test_value = np.empty_like(X_train[:10])
    n.tag.test_value = 100
    _sample_proba = approx.sample_node(neural_network.out.distribution.p,
                                       size=n,
                                       more_replacements={neural_network['ann_input']: x})
    # It is time to compile the function
    # No updates are needed for Approximation random generator
    # Efficient vectorized form of sampling is used
    sample_proba = theano.function([x, n], _sample_proba)

    # Create benchmark functions
    def production_step1():
        pm.set_data(new_data={'ann_input': X_test, 'ann_output': Y_test}, model=neural_network)
        ppc = pm.sample_posterior_predictive(trace, samples=500, progressbar=False, model=neural_network)

        # A mean predicted probability > 0.5 is classified as class 1
        pred = ppc['out'].mean(axis=0) > 0.5

    def production_step2():
        sample_proba(X_test, 500).mean(0) > 0.5

    production_step1()
    production_step2()

    pred = sample_proba(X_test, 500).mean(0) > 0.5


    fig, ax = plt.subplots()
    ax.scatter(X_test[pred==0, 0], X_test[pred==0, 1])
    ax.scatter(X_test[pred==1, 0], X_test[pred==1, 1], color='r')
    sns.despine()
    ax.set(title='Predicted labels in testing set', xlabel='X', ylabel='Y');

    print('Accuracy = {}%'.format((Y_test == pred).mean() * 100))
    return (Y_test == pred).mean() * 100

In [22]:
ans = []
for i in range(1, 20):
    print(i)
    ans.append(run_this(i))


1
Average Loss = 170.41: 100%|███████████████████████████████████████████████████| 30000/30000 [00:24<00:00, 1214.40it/s]
Finished [100%]: Average Loss = 170.41
Accuracy = 87.0%
2
Average Loss = 113: 100%|██████████████████████████████████████████████████████| 30000/30000 [00:28<00:00, 1060.68it/s]
Finished [100%]: Average Loss = 112.99
Accuracy = 96.0%
3
Average Loss = 128.59: 100%|███████████████████████████████████████████████████| 30000/30000 [00:29<00:00, 1020.41it/s]
Finished [100%]: Average Loss = 128.57
Accuracy = 94.8%
4
Average Loss = 124.48: 100%|████████████████████████████████████████████████████| 30000/30000 [00:35<00:00, 839.13it/s]
Finished [100%]: Average Loss = 124.52
Accuracy = 96.2%
5
Average Loss = 146.67: 100%|████████████████████████████████████████████████████| 30000/30000 [00:32<00:00, 930.88it/s]
Finished [100%]: Average Loss = 146.73
Accuracy = 95.0%
6
Average Loss = 173.92: 100%|████████████████████████████████████████████████████| 30000/30000 [00:34<00:00, 878.01it/s]
Finished [100%]: Average Loss = 173.8
Accuracy = 93.2%
7
Average Loss = 176.66: 100%|████████████████████████████████████████████████████| 30000/30000 [00:34<00:00, 864.61it/s]
Finished [100%]: Average Loss = 176.6
Accuracy = 92.80000000000001%
8
Average Loss = 177.69: 100%|████████████████████████████████████████████████████| 30000/30000 [00:33<00:00, 905.87it/s]
Finished [100%]: Average Loss = 177.68
Accuracy = 93.4%
9
Average Loss = 185.29: 100%|████████████████████████████████████████████████████| 30000/30000 [00:39<00:00, 752.49it/s]
Finished [100%]: Average Loss = 185.58
Accuracy = 92.60000000000001%
10
Average Loss = 203.71: 100%|████████████████████████████████████████████████████| 30000/30000 [00:40<00:00, 749.74it/s]
Finished [100%]: Average Loss = 203.63
Accuracy = 91.8%
11
Average Loss = 213.65: 100%|████████████████████████████████████████████████████| 30000/30000 [00:36<00:00, 816.07it/s]
Finished [100%]: Average Loss = 213.67
Accuracy = 90.4%
12
Average Loss = 223.63: 100%|████████████████████████████████████████████████████| 30000/30000 [00:30<00:00, 971.66it/s]
Finished [100%]: Average Loss = 223.76
Accuracy = 89.8%
13
Average Loss = 235.99: 100%|████████████████████████████████████████████████████| 30000/30000 [00:33<00:00, 904.05it/s]
Finished [100%]: Average Loss = 235.95
Accuracy = 88.0%
14
Average Loss = 223.86: 100%|████████████████████████████████████████████████████| 30000/30000 [00:35<00:00, 836.64it/s]
Finished [100%]: Average Loss = 224.89
Accuracy = 91.4%
15
Average Loss = 248.14: 100%|████████████████████████████████████████████████████| 30000/30000 [00:35<00:00, 854.00it/s]
Finished [100%]: Average Loss = 248.37
Accuracy = 87.6%
16
Average Loss = 255.71: 100%|████████████████████████████████████████████████████| 30000/30000 [00:36<00:00, 816.30it/s]
Finished [100%]: Average Loss = 255.4
Accuracy = 88.0%
17
Average Loss = 256.89: 100%|████████████████████████████████████████████████████| 30000/30000 [00:38<00:00, 771.58it/s]
Finished [100%]: Average Loss = 256.9
Accuracy = 88.8%
18
Average Loss = 253.72: 100%|████████████████████████████████████████████████████| 30000/30000 [00:37<00:00, 805.68it/s]
Finished [100%]: Average Loss = 253.13
Accuracy = 88.6%
19
Average Loss = 260.39: 100%|████████████████████████████████████████████████████| 30000/30000 [00:39<00:00, 768.01it/s]
Finished [100%]: Average Loss = 260.22
Accuracy = 88.0%

In [25]:
acc = pd.DataFrame(ans)
acc


Out[25]:
0
0 87.0
1 96.0
2 94.8
3 96.2
4 95.0
5 93.2
6 92.8
7 93.4
8 92.6
9 91.8
10 90.4
11 89.8
12 88.0
13 91.4
14 87.6
15 88.0
16 88.8
17 88.6
18 88.0

In [26]:
acc.plot()


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x24c7f59cac8>
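acc.plot() above uses the 0-based DataFrame index on the x-axis, which is off by one relative to n_hidden. A small sketch that re-indexes the same results before plotting (relies on the ans list built above):

# Re-index so the x-axis shows n_hidden = 1..19 rather than the default 0-based index
acc_by_hidden = pd.Series(ans, index=range(1, 20), name='accuracy (%)')
ax = acc_by_hidden.plot(marker='o')
ax.set(xlabel='n_hidden', ylabel='accuracy (%)');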
